{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Recommendation Systems using TensorFlow\n",
    "\n",
    "### Builds a recommendation engine using Movie Lens data \n",
    "We use the 1M data set for building our recommendation engine.\n",
    "The 1M data contain 1,000,209 anonymous ratings \n",
    "- Ratings for approximately *3,900 movies* \n",
    "- Ratings provided by *6,040 MovieLens users* who joined MovieLens in 2000\n",
    "\n",
    "More information about the data can be viewed at: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imports for data io operations\n",
    "from collections import deque\n",
    "from six import next\n",
    "import readers\n",
    "\n",
    "# Main imports for training\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "\n",
    "# Evaluate train times per epoch\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Constant seed for replicating training results\n",
    "np.random.seed(42)\n",
    "\n",
    "u_num = 6040 # Number of users in the dataset\n",
    "i_num = 3952 # Number of movies in the dataset\n",
    "\n",
    "batch_size = 1000 # Number of samples per batch\n",
    "dims = 15         # Dimensions of the data, 15\n",
    "max_epochs = 25   # Number of times the network sees all the training data\n",
    "\n",
    "# Device used for all computations\n",
    "place_device = \"/cpu:0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    # Reads file using the demiliter :: form the ratings file\n",
    "    # Download movie lens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
    "    # Columns are user ID, item ID, rating, and timestamp\n",
    "    # Sample data - 3::1196::4::978297539\n",
    "    df = readers.read_file(\"ratings.dat\", sep=\"::\")\n",
    "    rows = len(df)\n",
    "    # Purely integer-location based indexing for selection by position\n",
    "    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)\n",
    "    # Separate data into train and test, 90% for train and 10% for test\n",
    "    split_index = int(rows * 0.9)\n",
    "    # Use indices to separate the data\n",
    "    df_train = df[0:split_index]\n",
    "    df_test = df[split_index:].reset_index(drop=True)\n",
    "    \n",
    "    return df_train, df_test\n",
    "\n",
    "def clip(x):\n",
    "    return np.clip(x, 1.0, 5.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def model(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n",
    "    with tf.device(\"/cpu:0\"):\n",
    "        # Using a global bias term\n",
    "        bias_global = tf.get_variable(\"bias_global\", shape=[])\n",
    "        # User and item bias variables\n",
    "        # get_variable: Prefixes the name with the current variable scope \n",
    "        # and performs reuse checks.\n",
    "        w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n",
    "        w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n",
    "        # embedding_lookup: Looks up 'ids' in a list of embedding tensors\n",
    "        # Bias embeddings for user and items, given a batch\n",
    "        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n",
    "        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n",
    "        # User and item weight variables\n",
    "        w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n",
    "                                 initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "        w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n",
    "                                 initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "        # Weight embeddings for user and items, given a batch\n",
    "        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n",
    "        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n",
    "    \n",
    "    with tf.device(device):\n",
    "        # reduce_sum: Computes the sum of elements across dimensions of a tensor\n",
    "        infer = tf.reduce_sum(tf.mul(embd_user, embd_item), 1)\n",
    "        infer = tf.add(infer, bias_global)\n",
    "        infer = tf.add(infer, bias_user)\n",
    "        infer = tf.add(infer, bias_item, name=\"svd_inference\")\n",
    "        # l2_loss: Computes half the L2 norm of a tensor without the sqrt\n",
    "        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), \n",
    "                             name=\"svd_regularizer\")\n",
    "    return infer, regularizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loss(infer, regularizer, rate_batch, learning_rate=0.1, reg=0.1, device=\"/cpu:0\"):\n",
    "    with tf.device(device):\n",
    "        # Use L2 loss to compute penalty\n",
    "        cost_l2 = tf.nn.l2_loss(tf.sub(infer, rate_batch))\n",
    "        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n",
    "        cost = tf.add(cost_l2, tf.mul(regularizer, penalty))\n",
    "        # 'Follow the Regularized Leader' optimizer\n",
    "        # Reference: http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf\n",
    "        train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)\n",
    "    return cost, train_op"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of train samples 900188, test samples 100021, samples per batch 900\n"
     ]
    }
   ],
   "source": [
    "# Read data from ratings file to build a TF model\n",
    "df_train, df_test = get_data()\n",
    "\n",
    "samples_per_batch = len(df_train) // batch_size\n",
    "print(\"Number of train samples %d, test samples %d, samples per batch %d\" % \n",
    "      (len(df_train), len(df_test), samples_per_batch))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    5411\n",
      "1    5439\n",
      "2     367\n",
      "3     424\n",
      "4    4941\n",
      "Name: user, dtype: int32\n",
      "0    1696\n",
      "1    5448\n",
      "2    2242\n",
      "3    5629\n",
      "4     423\n",
      "Name: user, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 user values\n",
    "print(df_train[\"user\"].head()) \n",
    "print(df_test[\"user\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2682\n",
      "1     903\n",
      "2    3716\n",
      "3    1720\n",
      "4    3696\n",
      "Name: item, dtype: int32\n",
      "0    3113\n",
      "1    1195\n",
      "2     749\n",
      "3    3623\n",
      "4    2899\n",
      "Name: item, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 item values\n",
    "print(df_train[\"item\"].head())\n",
    "print(df_test[\"item\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2.0\n",
      "1    5.0\n",
      "2    4.0\n",
      "3    4.0\n",
      "4    1.0\n",
      "Name: rate, dtype: float32\n",
      "0    5.0\n",
      "1    5.0\n",
      "2    5.0\n",
      "3    2.0\n",
      "4    2.0\n",
      "Name: rate, dtype: float32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 rate values\n",
    "print(df_train[\"rate\"].head())\n",
    "print(df_test[\"rate\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Using a shuffle iterator to generate random batches, for training\n",
    "iter_train = readers.ShuffleIterator([df_train[\"user\"],\n",
    "                                     df_train[\"item\"],\n",
    "                                     df_train[\"rate\"]],\n",
    "                                     batch_size=batch_size)\n",
    "\n",
    "# Sequentially generate one-epoch batches, for testing\n",
    "iter_test = readers.OneEpochIterator([df_test[\"user\"],\n",
    "                                     df_test[\"item\"],\n",
    "                                     df_test[\"rate\"]],\n",
    "                                     batch_size=-1)\n",
    "\n",
    "user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n",
    "item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n",
    "rate_batch = tf.placeholder(tf.float32, shape=[None])\n",
    "\n",
    "infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)\n",
    "_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch\tTrain Error\tVal Error\tElapsed Time\n",
      "00\t2.555\t\t2.528\t\t0.044 secs\n",
      "01\t1.080\t\t0.921\t\t2.027 secs\n",
      "02\t0.882\t\t0.895\t\t2.002 secs\n",
      "03\t0.861\t\t0.885\t\t1.998 secs\n",
      "04\t0.849\t\t0.879\t\t1.982 secs\n",
      "05\t0.839\t\t0.874\t\t2.009 secs\n",
      "06\t0.832\t\t0.870\t\t2.011 secs\n",
      "07\t0.825\t\t0.867\t\t1.997 secs\n",
      "08\t0.819\t\t0.864\t\t2.005 secs\n",
      "09\t0.813\t\t0.862\t\t2.014 secs\n",
      "10\t0.808\t\t0.860\t\t2.192 secs\n",
      "11\t0.804\t\t0.859\t\t2.188 secs\n",
      "12\t0.800\t\t0.858\t\t2.047 secs\n",
      "13\t0.796\t\t0.857\t\t1.979 secs\n",
      "14\t0.795\t\t0.856\t\t1.987 secs\n",
      "15\t0.791\t\t0.855\t\t2.015 secs\n",
      "16\t0.788\t\t0.854\t\t2.030 secs\n",
      "17\t0.785\t\t0.853\t\t2.005 secs\n",
      "18\t0.784\t\t0.853\t\t1.997 secs\n",
      "19\t0.781\t\t0.852\t\t1.975 secs\n",
      "20\t0.779\t\t0.852\t\t2.078 secs\n",
      "21\t0.777\t\t0.852\t\t2.020 secs\n",
      "22\t0.775\t\t0.851\t\t2.259 secs\n",
      "23\t0.774\t\t0.851\t\t2.370 secs\n",
      "24\t0.773\t\t0.851\t\t2.247 secs\n"
     ]
    }
   ],
   "source": [
    "saver = tf.train.Saver()\n",
    "init_op = tf.global_variables_initializer()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(init_op)\n",
    "    print(\"%s\\t%s\\t%s\\t%s\" % (\"Epoch\", \"Train Error\", \"Val Error\", \"Elapsed Time\"))\n",
    "    errors = deque(maxlen=samples_per_batch)\n",
    "    start = time.time()\n",
    "    for i in range(max_epochs * samples_per_batch):\n",
    "        users, items, rates = next(iter_train)\n",
    "        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n",
    "                                                               item_batch: items,\n",
    "                                                               rate_batch: rates})\n",
    "        pred_batch = clip(pred_batch)\n",
    "        errors.append(np.power(pred_batch - rates, 2))\n",
    "        if i % samples_per_batch == 0:\n",
    "            train_err = np.sqrt(np.mean(errors))\n",
    "            test_err2 = np.array([])\n",
    "            for users, items, rates in iter_test:\n",
    "                pred_batch = sess.run(infer, feed_dict={user_batch: users,\n",
    "                                                        item_batch: items})\n",
    "                pred_batch = clip(pred_batch)\n",
    "                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n",
    "            end = time.time()\n",
    "            \n",
    "            print(\"%02d\\t%.3f\\t\\t%.3f\\t\\t%.3f secs\" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))\n",
    "            start = end\n",
    "\n",
    "    saver.save(sess, './save/model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pred\tActual\n",
      "4.982\t5.000\n",
      "4.486\t5.000\n",
      "5.000\t5.000\n",
      "4.031\t2.000\n",
      "2.908\t2.000\n",
      "4.500\t5.000\n",
      "2.892\t3.000\n",
      "3.943\t4.000\n",
      "1.485\t2.000\n",
      "4.104\t1.000\n",
      "0.850395450933\n"
     ]
    }
   ],
   "source": [
    "# Inference using saved model\n",
    "init_op = tf.global_variables_initializer()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    #sess.run(init_op)\n",
    "    new_saver = tf.train.import_meta_graph('./save/model.meta')\n",
    "    new_saver.restore(sess, tf.train.latest_checkpoint('./save/'))\n",
    "    test_err2 = np.array([])\n",
    "    for users, items, rates in iter_test:\n",
    "        pred_batch = sess.run(infer, feed_dict={user_batch: users,\n",
    "                                                item_batch: items})\n",
    "        pred_batch = clip(pred_batch)\n",
    "        print(\"Pred\\tActual\")\n",
    "        for ii in range(10):\n",
    "            print(\"%.3f\\t%.3f\" % (pred_batch[ii], rates[ii]))\n",
    "        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n",
    "        print(np.sqrt(np.mean(test_err2)))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
